#data = pd.read_csv('NOVAIMS_MAA_2020e21_BusinessCasesDataScience_MindOverData_RetailChallenge.csv')
#data.head()
#data['Measures'].value_counts()
#len(data['ProductFamily_ID'].unique())
#len(data['ProductCategory_ID'].unique())
#len(data['ProductBrand_ID'].unique())
#len(data['ProductName_ID'].unique())
#len(data['ProductPackSKU_ID'].unique())
#len(data['Point-of-Sale_ID'].unique())
#data.info()
#data['Date'] = pd.to_datetime(data['Date'], format='%Y-%m-%d', errors='ignore')
#data.info()
#unit_df = data[data['Measures']=='Sell-out units'].copy()
#value_df = data[data['Measures']=='Sell-out values'].copy()
#df_units = data[data["Measures"] == "Sell-out units"]
#df_value = data[data["Measures"] == "Sell-out values"]
#df_value["Units"] = df_units["Value"].values
#data = df_value.drop("Measures", axis=1)
#data['ProductFamily_ID'] = data['ProductFamily_ID'].str.split('_').str[1]
#data['ProductCategory_ID'] = data['ProductCategory_ID'].str.split('_').str[1]
#data['ProductBrand_ID'] = data['ProductBrand_ID'].str.split('_').str[1]
#data['ProductName_ID'] = data['ProductName_ID'].str.split('_').str[1]
#data['ProductPackSKU_ID'] = data['ProductPackSKU_ID'].str.split('_').str[1]
#data.head()
#data['Point-of-Sale_ID'] = data['Point-of-Sale_ID'].str.split('_').str[1]
#data['Date'] = pd.to_datetime(data['Date'])
#data['Day of the Week'] = data['Date'].dt.dayofweek
#data['Quarter'] = data['Date'].dt.quarter
#data.to_csv('data_preprocess.csv')
import pandas as pd
df = pd.read_csv('C:/Users/migue/Desktop/Datasets/data_preprocess.csv')
df.shape
(91171152, 12)
df["Point-of-Sale_ID"].unique()
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78,
79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91,
92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182,
183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195,
196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208,
209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,
222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234,
235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247,
248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260,
261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273,
274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286,
287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312,
313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325,
326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338,
339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351,
352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364,
365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377,
378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390,
391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403,
404, 405, 406, 407, 408, 409, 410], dtype=int64)
df["Quarter"].unique()
array([1, 2, 4, 3], dtype=int64)
df["Point-of-Sale_ID"].unique()
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78,
79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91,
92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182,
183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195,
196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208,
209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,
222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234,
235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247,
248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260,
261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273,
274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286,
287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312,
313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325,
326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338,
339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351,
352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364,
365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377,
378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390,
391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403,
404, 405, 406, 407, 408, 409, 410], dtype=int64)
df["ProductFamily_ID"].unique()
array([16, 6, 1, 12, 15, 9, 14, 13, 10, 2, 7, 11, 20, 8, 3, 17, 21,
18, 19, 5, 4], dtype=int64)
df.head()
| Unnamed: 0 | ProductFamily_ID | ProductCategory_ID | ProductBrand_ID | ProductName_ID | ProductPackSKU_ID | Point-of-Sale_ID | Date | Value | Day of the Week | Quarter | Units | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 16 | 11 | 306 | 649 | 1970 | 1 | 2017-03-04 | 1540.0 | 5 | 1 | 2.0 |
| 1 | 3 | 16 | 11 | 306 | 649 | 1970 | 1 | 2016-05-02 | 3080.0 | 0 | 2 | 4.0 |
| 2 | 5 | 16 | 11 | 306 | 649 | 1970 | 1 | 2016-10-24 | 1540.0 | 0 | 4 | 2.0 |
| 3 | 9 | 16 | 11 | 306 | 649 | 1970 | 1 | 2017-10-13 | 1620.0 | 4 | 4 | 2.0 |
| 4 | 10 | 16 | 11 | 306 | 649 | 1970 | 1 | 2017-10-14 | 1620.0 | 5 | 4 | 2.0 |
Drop rows that aren't relevant to perform Data Visualization
df = df.drop(["Unnamed: 0", "ProductBrand_ID", "ProductName_ID", "ProductPackSKU_ID"], axis=1)
df = df.drop(["Day of the Week"], axis=1)
Slicing the Dataset, in order to make visualizations faster to perform
#eda_dash = df[(df['Year'].isin([2016,2017,2018,2019]))&(df['Quarter'].isin([1,2,3,4]))&(df['Point-of-Sale_ID'].isin([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]))]
#eda_dash.info()
#Selecting Point of Sale
df_pos = df.loc[df['Point-of-Sale_ID'].isin([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20])]
df_pos["Year"]= df_pos["Date"].str.split("-").str[0]
df_pos['Date'] = pd.to_datetime(df_pos['Date'])
#eda_dash = df_pos
#eda_dash.to_csv("eda_dash.csv", index=False)
<ipython-input-41-11edff36ab14>:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df_pos["Year"]= df_pos["Date"].str.split("-").str[0]
<ipython-input-41-11edff36ab14>:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df_pos['Date'] = pd.to_datetime(df_pos['Date'])
df_pos['Date'].iloc[0]
Timestamp('2017-03-04 00:00:00')
df_pos.head()
| ProductFamily_ID | ProductCategory_ID | Point-of-Sale_ID | Date | Value | Quarter | Units | Year | |
|---|---|---|---|---|---|---|---|---|
| 0 | 16 | 11 | 1 | 2017-03-04 | 1540.0 | 1 | 2.0 | 2017 |
| 1 | 16 | 11 | 1 | 2016-05-02 | 3080.0 | 2 | 4.0 | 2016 |
| 2 | 16 | 11 | 1 | 2016-10-24 | 1540.0 | 4 | 2.0 | 2016 |
| 3 | 16 | 11 | 1 | 2017-10-13 | 1620.0 | 4 | 2.0 | 2017 |
| 4 | 16 | 11 | 1 | 2017-10-14 | 1620.0 | 4 | 2.0 | 2017 |
type(df_pos['Quarter'][0])
numpy.int64
type(df_pos['Year'][0])
str
Perform filtering option that will be selected by the the dash callback function in the dashboard
#Selecting quarter and year
quarter = 3
year = "2016"
pos = 1
df_pos = df_pos[(df_pos['Quarter'] == quarter) & (df_pos["Year"] == year) & (df_pos['Point-of-Sale_ID'] == pos)]
df_pos
| ProductFamily_ID | ProductCategory_ID | Point-of-Sale_ID | Date | Value | Quarter | Units | Year | |
|---|---|---|---|---|---|---|---|---|
| 16 | 16 | 11 | 1 | 2016-07-01 | 1540.0 | 3 | 2.0 | 2016 |
| 17 | 16 | 11 | 1 | 2016-07-07 | 1540.0 | 3 | 2.0 | 2016 |
| 25 | 16 | 11 | 1 | 2016-07-21 | 1540.0 | 3 | 2.0 | 2016 |
| 69 | 6 | 178 | 1 | 2016-08-10 | 1782.0 | 3 | 2.0 | 2016 |
| 72 | 6 | 178 | 1 | 2016-09-29 | 1782.0 | 3 | 2.0 | 2016 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 190385 | 4 | 34 | 1 | 2016-08-17 | 1220.0 | 3 | 2.0 | 2016 |
| 190386 | 4 | 34 | 1 | 2016-08-25 | 2440.0 | 3 | 4.0 | 2016 |
| 190453 | 4 | 34 | 1 | 2016-07-24 | 1220.0 | 3 | 2.0 | 2016 |
| 190454 | 4 | 34 | 1 | 2016-07-27 | 1220.0 | 3 | 2.0 | 2016 |
| 190455 | 4 | 34 | 1 | 2016-07-28 | 1450.0 | 3 | 2.0 | 2016 |
12155 rows × 8 columns
Convert Date columns to Datetime
#tem que ser para o df todo
df_pos['Date']= pd.to_datetime(df_pos['Date'])
df_pos["Year"].unique()
array(['2016'], dtype=object)
df_1 = df_pos.groupby(df_pos['Date']).sum()
df_1
| ProductFamily_ID | ProductCategory_ID | Point-of-Sale_ID | Value | Quarter | Units | |
|---|---|---|---|---|---|---|
| Date | ||||||
| 2016-07-01 | 1533 | 23016 | 152 | 747967.000000 | 456 | 524.0 |
| 2016-07-02 | 1314 | 17192 | 118 | 464443.000000 | 354 | 324.0 |
| 2016-07-03 | 485 | 6305 | 40 | 153591.000000 | 120 | 91.0 |
| 2016-07-04 | 1825 | 27193 | 176 | 959309.000000 | 528 | 606.0 |
| 2016-07-05 | 1493 | 19655 | 132 | 801186.000000 | 396 | 454.0 |
| ... | ... | ... | ... | ... | ... | ... |
| 2016-09-26 | 1071 | 17121 | 116 | 653040.000000 | 348 | 434.0 |
| 2016-09-27 | 1304 | 19002 | 134 | 672074.000000 | 402 | 426.0 |
| 2016-09-28 | 1465 | 20603 | 143 | 745724.000000 | 429 | 481.0 |
| 2016-09-29 | 1236 | 17123 | 129 | 687659.000000 | 387 | 444.0 |
| 2016-09-30 | 1459 | 18345 | 132 | 661477.333333 | 396 | 436.0 |
92 rows × 6 columns
#df = pd.read_csv("values_units.csv", nrows=1000000)
#df_test.to_csv("teste.csv")
Bar Charts that shows either the Value in Millions or the number of units sold for the selected Point of Sale, Quarter and Year
#For Point of Sale 1 with quarter = and year =
import plotly.express as px
fig = px.bar(df_1, x=df_1.index, y="Value", title='Value in Millions for the ' + str(quarter) + " quarter of the year " + str(year))
fig.update_layout(paper_bgcolor='rgba(255,255,255)', plot_bgcolor='rgba(0,0,0,0)')
fig.update_xaxes(showline=True, linewidth=1, linecolor='black')
fig.update_yaxes(showline=True, linewidth=1, linecolor='black')
fig.show()
Bar Charts that shows either the Value in Millions or the number of units sold for the selected Point of Sale, Quarter and Year
#For Point of Sale 1 with quarter = and year =
fig = px.bar(df_1, x=df_1.index, y="Units", title='Units sold for the ' + str(quarter) + " quarter of the year " + str(year))
fig.update_layout(paper_bgcolor='rgba(255,255,255)', plot_bgcolor='rgba(0,0,0,0)')
fig.update_xaxes(showline=True, linewidth=1, linecolor='black')
fig.update_yaxes(showline=True, linewidth=1, linecolor='black')
fig.show()
Bar Charts that shows either the Value in Millions or the number of units sold for the selected Point of Sale, Quarter and Year
fig = px.bar(df_1, x=df_1.index, y="Units", color='Value')
fig.update_layout(paper_bgcolor='rgba(255,255,255)', plot_bgcolor='rgba(0,0,0,0)')
fig.update_xaxes(showline=True, linewidth=1, linecolor='black')
fig.update_yaxes(showline=True, linewidth=1, linecolor='black')
fig.show()
df_2 = df_pos.groupby(df_pos['ProductFamily_ID']).sum()
df_2.sort_values(by="Units")
df_2.reset_index()
df_2.shape
(19, 5)
df_2.head()
| ProductCategory_ID | Point-of-Sale_ID | Value | Quarter | Units | |
|---|---|---|---|---|---|
| ProductFamily_ID | |||||
| 1 | 381159 | 2178 | 5.524699e+06 | 6534 | 7155.0 |
| 2 | 116590 | 655 | 1.210329e+07 | 1965 | 2668.0 |
| 3 | 14596 | 82 | 1.716251e+06 | 246 | 377.0 |
| 4 | 374 | 11 | 1.515000e+04 | 33 | 24.0 |
| 5 | 18512 | 104 | 3.004990e+05 | 312 | 648.0 |
Bar charts that show either the value or the number of units sold for each one of the ProductFamiliyID's
fig = px.bar(df_2, x=df_2.index, y="Units", title='Units sold for the ' + str(quarter) + " quarter of the year " + str(year))
fig.update_layout(paper_bgcolor='rgba(255,255,255)', plot_bgcolor='rgba(0,0,0,0)')
fig.update_xaxes(showline=True, linewidth=1, linecolor='black')
fig.update_yaxes(showline=True, linewidth=1, linecolor='black')
fig.show()
Bar charts that show either the value or the number of units sold for each one of the ProductFamiliyID's
fig2 = px.bar(df_2, x=df_2.index, y=df_2["Value"], title='Value in Millions for the year')
fig2.update_layout(paper_bgcolor='rgba(255,255,255)', plot_bgcolor='rgba(0,0,0,0)')
fig2.update_xaxes(showline=True, linewidth=1, linecolor='black')
fig2.update_yaxes(showline=True, linewidth=1, linecolor='black')
fig2.show()
df_3 = df_pos.groupby(df['ProductCategory_ID']).sum()
df_3 = df_3.sort_values(by="Units", ascending=False).head(10)
df_3
| ProductFamily_ID | ProductCategory_ID | Point-of-Sale_ID | Value | Quarter | Units | |
|---|---|---|---|---|---|---|
| ProductCategory_ID | ||||||
| 178 | 59184 | 1493064 | 8388 | 4.506455e+07 | 25164 | 28157.0 |
| 175 | 11322 | 110075 | 629 | 5.528173e+06 | 1887 | 3041.0 |
| 22 | 5105 | 7722 | 351 | 1.130110e+06 | 1053 | 938.0 |
| 27 | 3346 | 6453 | 239 | 5.658950e+05 | 717 | 920.0 |
| 3 | 3705 | 741 | 247 | 5.244510e+05 | 741 | 659.0 |
| 34 | 4496 | 7582 | 223 | 1.122597e+06 | 669 | 543.0 |
| 2 | 1725 | 230 | 115 | 3.629660e+05 | 345 | 509.0 |
| 11 | 2708 | 1683 | 153 | 2.968320e+05 | 459 | 389.0 |
| 28 | 952 | 1904 | 68 | 1.966400e+05 | 204 | 312.0 |
| 149 | 2856 | 20264 | 136 | 6.489800e+05 | 408 | 300.0 |
fig = px.pie(df_3, values=df_3["Units"], names=df_3.index)
fig.show()
fig = px.treemap(df_3, path=['ProductFamily_ID', 'ProductCategory_ID'], values='ProductCategory_ID')
fig.show()